{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import libraries and environment variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nest_asyncio\n",
    "import random\n",
    "\n",
    "nest_asyncio.apply()\n",
    "from dotenv import load_dotenv\n",
    "\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.core.prompts import PromptTemplate\n",
    "\n",
    "from llama_index.core.evaluation import (\n",
    "    DatasetGenerator,\n",
    "    FaithfulnessEvaluator,\n",
    "    RelevancyEvaluator\n",
    ")\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core import Settings\n",
    "\n",
    "import openai\n",
    "import time\n",
    "import os\n",
    "load_dotenv()\n",
    "openai.api_key = os.getenv(\"OPENAI_API_KEY\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read Docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_dir = \"../data\"\n",
    "documents = SimpleDirectoryReader(data_dir).load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create evaluation questions and pick k out of them"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_eval_questions = 25\n",
    "\n",
    "eval_documents = documents[0:20]\n",
    "data_generator = DatasetGenerator.from_documents(eval_documents)\n",
    "eval_questions = data_generator.generate_questions_from_nodes()\n",
    "k_eval_questions = random.sample(eval_questions, num_eval_questions)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Define metrics evaluators and modify llama_index faithfullness evaluator prompt to rely on the context "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We will use GPT-4 for evaluating the responses\n",
    "gpt4 = OpenAI(temperature=0, model=\"gpt-4o\")\n",
    "\n",
    "# Set appropriate settings for the LLM\n",
    "Settings.llm = gpt4\n",
    "\n",
    "# Define Faithfulness Evaluators which are based on GPT-4\n",
    "faithfulness_gpt4 = FaithfulnessEvaluator()\n",
    "\n",
    "faithfulness_new_prompt_template = PromptTemplate(\"\"\" Please tell if a given piece of information is directly supported by the context.\n",
    "    You need to answer with either YES or NO.\n",
    "    Answer YES if any part of the context explicitly supports the information, even if most of the context is unrelated. If the context does not explicitly support the information, answer NO. Some examples are provided below.\n",
    "\n",
    "    Information: Apple pie is generally double-crusted.\n",
    "    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.\n",
    "    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.\n",
    "    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).\n",
    "    Answer: YES\n",
    "\n",
    "    Information: Apple pies taste bad.\n",
    "    Context: An apple pie is a fruit pie in which the principal filling ingredient is apples.\n",
    "    Apple pie is often served with whipped cream, ice cream ('apple pie à la mode'), custard, or cheddar cheese.\n",
    "    It is generally double-crusted, with pastry both above and below the filling; the upper crust may be solid or latticed (woven of crosswise strips).\n",
    "    Answer: NO\n",
    "\n",
    "    Information: Paris is the capital of France.\n",
    "    Context: This document describes a day trip in Paris. You will visit famous landmarks like the Eiffel Tower, the Louvre Museum, and Notre-Dame Cathedral.\n",
    "    Answer: NO\n",
    "\n",
    "    Information: {query_str}\n",
    "    Context: {context_str}\n",
    "    Answer:\n",
    "\n",
    "    \"\"\")\n",
    "\n",
    "faithfulness_gpt4.update_prompts({\"your_prompt_key\": faithfulness_new_prompt_template}) # Update the prompts dictionary with the new prompt template\n",
    "\n",
    "# Define Relevancy Evaluators which are based on GPT-4\n",
    "relevancy_gpt4 = RelevancyEvaluator()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Function to evaluate metrics for each chunk size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size\n",
    "# We use GPT-3.5-Turbo to generate response and GPT-4 to evaluate it.\n",
    "def evaluate_response_time_and_accuracy(chunk_size, eval_questions):\n",
    "    \"\"\"\n",
    "    Evaluate the average response time, faithfulness, and relevancy of responses generated by GPT-3.5-turbo for a given chunk size.\n",
    "    \n",
    "    Parameters:\n",
    "    chunk_size (int): The size of data chunks being processed.\n",
    "    \n",
    "    Returns:\n",
    "    tuple: A tuple containing the average response time, faithfulness, and relevancy metrics.\n",
    "    \"\"\"\n",
    "\n",
    "    total_response_time = 0\n",
    "    total_faithfulness = 0\n",
    "    total_relevancy = 0\n",
    "\n",
    "    # create vector index\n",
    "    llm = OpenAI(model=\"gpt-3.5-turbo\")\n",
    "\n",
    "    Settings.llm = llm\n",
    "    Settings.chunk_size = chunk_size\n",
    "    Settings.chunk_overlap = chunk_size // 5 \n",
    "\n",
    "    vector_index = VectorStoreIndex.from_documents(eval_documents)\n",
    "    \n",
    "    # build query engine\n",
    "    query_engine = vector_index.as_query_engine(similarity_top_k=5)\n",
    "    num_questions = len(eval_questions)\n",
    "\n",
    "    # Iterate over each question in eval_questions to compute metrics.\n",
    "    # While BatchEvalRunner can be used for faster evaluations (see: https://docs.llamaindex.ai/en/latest/examples/evaluation/batch_eval.html),\n",
    "    # we're using a loop here to specifically measure response time for different chunk sizes.\n",
    "    for question in eval_questions:\n",
    "        start_time = time.time()\n",
    "        response_vector = query_engine.query(question)\n",
    "        elapsed_time = time.time() - start_time\n",
    "        \n",
    "        faithfulness_result = faithfulness_gpt4.evaluate_response(\n",
    "            response=response_vector\n",
    "        ).passing\n",
    "        \n",
    "        relevancy_result = relevancy_gpt4.evaluate_response(\n",
    "            query=question, response=response_vector\n",
    "        ).passing\n",
    "\n",
    "        total_response_time += elapsed_time\n",
    "        total_faithfulness += faithfulness_result\n",
    "        total_relevancy += relevancy_result\n",
    "\n",
    "    average_response_time = total_response_time / num_questions\n",
    "    average_faithfulness = total_faithfulness / num_questions\n",
    "    average_relevancy = total_relevancy / num_questions\n",
    "\n",
    "    return average_response_time, average_faithfulness, average_relevancy"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Test different chunk sizes "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\N7\\AppData\\Local\\Temp\\ipykernel_22672\\1178342312.py:21: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n",
      "  service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size, chunk_overlap=chunk_size//5)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Chunk size 128 - Average Response time: 1.35s, Average Faithfulness: 1.00, Average Relevancy: 1.00\n",
      "Chunk size 256 - Average Response time: 1.31s, Average Faithfulness: 1.00, Average Relevancy: 1.00\n"
     ]
    }
   ],
   "source": [
    "chunk_sizes = [128, 256]\n",
    "\n",
    "for chunk_size in chunk_sizes:\n",
    "  avg_response_time, avg_faithfulness, avg_relevancy = evaluate_response_time_and_accuracy(chunk_size, k_eval_questions)\n",
    "  print(f\"Chunk size {chunk_size} - Average Response time: {avg_response_time:.2f}s, Average Faithfulness: {avg_faithfulness:.2f}, Average Relevancy: {avg_relevancy:.2f}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
